Quantifying relevance: Data exploration write-up

Author

Michael Franke

Code
library(tidyverse)
library(brms)
library(tidyboot)
library(tidyjson)
library(patchwork)
library(GGally)
library(cowplot)
library(BayesFactor)
library(aida)   # custom helpers: https://github.com/michael-franke/aida-package
library(faintr) # custom helpers: https://michael-franke.github.io/faintr/index.html

##################################################

# these options help Stan run faster
options(mc.cores = parallel::detectCores())

# use the aida-theme for plotting
theme_set(theme_aida())

# global color scheme / non-optimized
# (use `<-` for assignment, per tidyverse style)
project_colors <- c("#E69F00", "#56B4E9", "#009E73", 
                    "#F0E442", "#0072B2", "#D55E00", 
                    "#CC79A7", "#000000")

# setting theme colors globally: mask ggplot2's default discrete
# color/fill scales so all plots pick up the project palette
scale_colour_discrete <- function(...) {
  scale_colour_manual(..., values = project_colors)
}
scale_fill_discrete <- function(...) {
  scale_fill_manual(..., values = project_colors)
} 

##################################################

# toggle: TRUE refits all brms models (slow); FALSE loads cached fits from disk
rerun_models <- FALSE 

Read, inspect & massage the data

The cleaned and pre-processed data comes in JSON format, and contains nested lists, which we here spread out fully into a regular tibble.

Code
# read cleaned data from JSON-lines file ONCE and reuse the parsed object
# (the original re-read and re-parsed the same file three times)
results_json <- read_json("../results/round_1.0/results_processed.jsonl")

# cast to a regular tibble
d <- results_json %>% 
  spread_all() %>% 
  select(-..JSON, -document.id) %>% as_tibble() %>% 
  # set "non-answers" to AnswerPolarity "positive"
  mutate(AnswerPolarity = ifelse(AnswerCertainty == "non_answer", "positive", AnswerPolarity))

# add information from (nested) JSON entries on beta-parameters for prior and posterior;
# each entry is a 2-element array: index 1 = alpha (a), index 2 = beta (b)
prior_beta <- results_json %>% 
  enter_object(prior_beta) %>% gather_array() %>% 
  mutate(parameter = ifelse(array.index == 1, "prior_beta_a", "prior_beta_b"),
         value = as.numeric(..JSON)) %>% 
  pivot_wider(id_cols = document.id, names_from = parameter, values_from = value)
posterior_beta <- results_json %>% 
  enter_object(posterior_beta) %>% gather_array() %>% 
  mutate(parameter = ifelse(array.index == 1, "posterior_beta_a", "posterior_beta_b"),
         value = as.numeric(..JSON)) %>% 
  pivot_wider(id_cols = document.id, names_from = parameter, values_from = value)

# bind tibbles together (row order is shared via document.id, so
# column-binding after dropping the id is safe)
d <- cbind(
  d,
  prior_beta %>% select(-document.id), 
  posterior_beta %>% select(-document.id)
)

Here’s a glimpse at the data:

Code
# quick structural overview of the assembled data (rows, columns, types)
glimpse(d)
Rows: 704
Columns: 27
$ submission_id            <dbl> 3010, 3010, 3010, 3010, 3010, 3010, 3010, 301…
$ group                    <chr> "relevant", "relevant", "relevant", "relevant…
$ StimID                   <dbl> 1, 2, 4, 5, 6, 7, 10, 11, 12, 1, 2, 3, 5, 6, …
$ AnswerCertainty          <chr> "low_certainty", "low_certainty", "exhaustive…
$ AnswerPolarity           <chr> "negative", "negative", "negative", "positive…
$ ContextType              <chr> "positive", "neutral", "neutral", "neutral", …
$ posterior_confidence     <dbl> 5, 5, 4, 6, 3, 5, 6, 7, 7, 6, 7, 6, 3, 3, 3, …
$ prior_confidence         <dbl> 5, 5, 5, 5, 5, 3, 4, 7, 5, 6, 5, 2, 3, 2, 3, …
$ posterior_sliderResponse <dbl> 0.66, 0.20, 0.01, 0.80, 0.45, 0.05, 0.95, 0.9…
$ prior_sliderResponse     <dbl> 0.80, 0.75, 0.05, 0.70, 0.74, 0.10, 0.15, 0.7…
$ relevance_sliderResponse <dbl> 0.35, 0.40, 0.66, 1.00, 0.25, 0.45, 1.00, 1.0…
$ prior_concentration      <dbl> 87.75999, 87.75999, 87.75999, 87.75999, 87.75…
$ posterior_concentration  <dbl> 87.75999, 87.75999, 49.08459, 156.90904, 27.4…
$ kl                       <dbl> 0.07710939, 0.96107940, 0.03568672, 0.0371235…
$ kl_util                  <dbl> 0.16268165, 0.89062436, 0.07888622, 0.0819286…
$ entropy_reduction        <dbl> 2.028906e-01, 8.935003e-02, 2.056038e-01, 1.5…
$ bayes_factor             <dbl> 0.3139950, 1.0791812, 0.7168816, 0.2340832, 0…
$ exp_bayes_factor         <dbl> 0.5147059, 0.9166667, 0.8080808, 0.4166667, 0…
$ posterior_distance       <dbl> 0.32, 0.60, 0.98, 0.60, 0.10, 0.90, 0.90, 0.8…
$ prior_posterior_distance <dbl> 0.14, 0.55, 0.04, 0.10, 0.29, 0.05, 0.80, 0.2…
$ kl_beta                  <dbl> 4.0284863, 59.1370597, 1.8244812, 2.5182371, …
$ kl_util_beta             <dbl> 0.9387220, 1.0000000, 0.7176574, 0.8254439, 0…
$ entropy_reduction_beta   <dbl> -0.16564407, 0.07752212, 0.23889078, 0.423001…
$ prior_beta_a             <dbl> 69.607993, 65.319994, 5.288000, 61.031994, 64…
$ prior_beta_b             <dbl> 18.151998, 22.439998, 82.471992, 26.727997, 2…
$ posterior_beta_a         <dbl> 57.601594, 18.151998, 1.470846, 124.927229, 1…
$ posterior_beta_b         <dbl> 30.158397, 69.607993, 47.613747, 31.981807, 1…

Here’s a list of what each column represents:

$submission_id
[1] "ID for each participant"

$group
[1] "whether the participant saw trigger word 'helpful' or 'relevant'"

$StimID
[1] "ID for each stimulus (vignette)"

$AnswerCertainty
[1] "whether shown answer in trial was 'exhaustive', 'high_certainty', 'low_certainty' or 'non_answer'"

$AnswerPolarity
[1] "whether shown answer in trial was 'positive' (suggesting yes) or 'negative' (suggesting no)"

$ContextType
[1] "whether the vignette was realized as making a 'no' answer more likely ('negative'), or a 'yes' answer ('positive') or neither ('neutral')"

$posterior_confidence
[1] "slider rating indicating confidence in the corresponding posterior rating"

$prior_confidence
[1] "slider rating indicating confidence in the corresponding prior rating"

$posterior_sliderResponse
[1] "slider rating indicating posterior belief (= after the answer)"

$prior_sliderResponse
[1] "slider rating indicating prior belief (= before the answer)"

$relevance_sliderResponse
[1] "slider rating indicating how 'relevant' or 'helpful' the answer was (trigger word depends on 'group' variable)"

$prior_concentration
[1] "inferred concentration parameter of a beta distribution representing the participant's prior and prior-confidence ratings"

$posterior_concentration
[1] "inferred concentration parameter of a beta distribution representing the participant's posterior and posterior-confidence ratings"

[[14]]
[1] "kl = KL divergence from posterior (true distribution) to prior (approximate distribution)"

$kl_util
[1] "scaled KL divergence as 1 - (10^{-kl})"

$entropy_reduction
[1] "abs(Entropy(prior) - Entropy(posterior))"

$bayes_factor
[1] "abs(log(BF)) where BF is the Bayes factor, computed as posterior-odds / prior-odds (based on ratings given for prior and posterior)"

$exp_bayes_factor
[1] "scaled Bayes factor, computed as 1 - (10^{-bayes_factor})"

$posterior_distance
[1] "distance of posterior from uniform distribution, calculated as 2 * abs(0.5 - prior)"

$prior_posterior_distance
[1] "distance between prior and posterior, calculated as abs(prior - posterior)"

$kl_beta
[1] "KL divergence between the higher-order prior & posterior distributions, which are computed from the prior/posterior slider ratings and the corresponding confidence ratings"

$kl_util_beta
[1] "scaled kl_beta computed as 2^{-kl_beta}"

$entropy_reduction_beta
[1] "Entropy(higher-order prior) - Entropy(higher-order posterior)"

$prior_beta_a
[1] "alpha parameter of the (inferred) beta distribution describing the participants prior beliefs, based on ratings for 'prior' and 'prior_confidence'"

$prior_beta_b
[1] "beta parameter of the (inferred) beta distribution describing the participants prior beliefs, based on ratings for 'prior' and 'prior_confidence'"

$posterior_beta_a
[1] "alpha parameter of the (inferred) beta distribution describing the participants posterior beliefs, based on ratings for 'posterior' and 'posterior_confidence'"

$posterior_beta_b
[1] "beta parameter of the (inferred) beta distribution describing the participants posterior beliefs, based on ratings for 'posterior' and 'posterior_confidence'"

For convenience, a bit of renaming and re-leveling:

Code
# rename and re-level factors; the explicit level order fixes the
# reference cells used in the regression contrasts below
# (a duplicated AnswerPolarity releveling statement was removed)
d <- d %>% 
  mutate(
    "trigger word" = factor(group, levels = c("relevant", "helpful")),
    "relevance rating" = relevance_sliderResponse,
    ContextType = factor(ContextType, 
                         levels = c("negative", "neutral", "positive")),
    AnswerPolarity = factor(AnswerPolarity, 
                            levels = c("positive", "negative")),
    AnswerCertainty = factor(AnswerCertainty, 
                             levels = c("non_answer", "low_certainty", 
                                        "high_certainty", "exhaustive"))
  ) 

Number of participants:

[1] 71

Exploring the effect of the experimental factors

The experiment had the following factors:

  • group or trigger word: whether a participant rated the ‘helpfulness’ or the ‘relevance’ of the answer (between-subjects variable)
  • ContextType: whether the context made a ‘no’ or a ‘yes’ answer more likely /a priori/ or whether it was neutral (within-subjects)
  • AnswerCertainty: how much information the answer provides towards a fully resolving answer (within-subjects)
  • AnswerPolarity: whether the answer suggests or implies a ‘no’ or a ‘yes’ answer (within-subjects)
    • ‘non-answers’ are treated as ‘positive’ for technical purposes, but this does not influence relevant analyses

In the following, we first check whether these experimental manipulations worked as intended.

Sanity-checking whether the manipulations worked as intended

Effects of ContextType on prior and prior confidence

To check whether the ContextType manipulation worked, we compare how participants rated the prior probability of a ‘yes’ answer under each level of the ContextType variable. Concretely, we expect this order of prior ratings for the levels of ContextType: negative < neutral < positive. Although we have no specific hypotheses or sanity-checking questions regarding the confidence ratings, let’s also scrutinize the confidence ratings that participants gave with their prior ratings.

Prior ratings as a function of ContextType

Here is a first plot addressing the question about an effect of ContextType on participants' prior ratings.

Code
# density of prior ratings, split by context type
ggplot(d, aes(x = prior_sliderResponse, color = ContextType, fill = ContextType)) +
  geom_density(alpha = 0.3) +
  labs(x = "prior rating", y = "")

We dive deeper by fitting a regression model, predicting prior ratings in terms of the ContextType. Since participants have not seen the answer when they rate the prior probability of a ‘yes’ answer, ContextType is the only fixed effect we should include here. The model also includes the maximal RE structure.

Code
# prior ratings regressed on context type, with maximal RE structure;
# load the cached fit unless a refit was requested
if (!rerun_models) {
  fit_contextType_SC <- readRDS("cachedModels/fit_contextType_SC.Rds")
} else {
  fit_contextType_SC <- brm(
    prior_sliderResponse ~ ContextType +
      (1 + ContextType | StimID) +
      (1 + ContextType | submission_id),
    data = d
  )
  saveRDS(fit_contextType_SC, "cachedModels/fit_contextType_SC.Rds")
}

Our assumption is that prior ratings are higher in contexts classified as ‘neutral’ than in ‘negative’ contexts, and yet higher in ‘positive’ contexts. We use the faintr package to extract information on these directed comparisons.

Code
# directed comparison of posterior cell means (faintr::compare_groups):
# first step of the expected ordering negative < neutral < positive
ContextType_SC_negVSneu <- 
  compare_groups(
    fit_contextType_SC, 
    lower = ContextType == "negative",
    higher = ContextType == "neutral" 
  )
# second step of the expected ordering: neutral < positive
ContextType_SC_neuVSpos <- 
  compare_groups(
    fit_contextType_SC, 
    lower = ContextType == "neutral",
    higher = ContextType == "positive"
  )

Prior confidence as a function of ContextType

Here is a visualization of the effect of ContextType on participants’ confidence in their prior ratings.

Code
# density of prior-confidence ratings, split by context type
ggplot(d, aes(x = prior_confidence, color = ContextType, fill = ContextType)) +
  geom_density(alpha = 0.3) +
  labs(x = "prior confidence", y = "")

To scrutinize the effect of ContextType on participants' expressed confidence in their prior ratings, we use a similar regression model and gather information about the contrasts which — upon visual inspection — seem most likely to be meaningful.

Code
# prior confidence regressed on context type (same maximal RE structure
# as the prior-rating model); refit only when rerun_models is TRUE,
# otherwise load the cached fit from disk
if (rerun_models) {
  fit_contextType_SC_conf <- brm(
    prior_confidence ~ ContextType + 
      (1 + ContextType | StimID) + 
      (1 + ContextType | submission_id),
    data = d
  )
  saveRDS(fit_contextType_SC_conf, "cachedModels/fit_contextType_SC_conf.Rds")
} else{
  fit_contextType_SC_conf <- readRDS("cachedModels/fit_contextType_SC_conf.Rds")
}
Code
# confidence contrast: is confidence higher in 'negative' than in 'neutral' contexts?
ContextType_SC_neuVSneg_conf <- 
  compare_groups(
    fit_contextType_SC_conf, 
    lower = ContextType == "neutral",
    higher = ContextType == "negative" 
  )
print(ContextType_SC_neuVSneg_conf)
Outcome of comparing groups: 
 * higher:  ContextType == "negative" 
 * lower:   ContextType == "neutral" 
Mean 'higher - lower':  0.3523 
95% HDI:  [ 0.0742 ; 0.6352 ]
P('higher - lower' > 0):  0.9895 
Posterior odds:  94.24 
Code
# confidence contrast: is confidence higher in 'positive' than in 'negative' contexts?
ContextType_SC_negVSpos_conf <- 
  compare_groups(
    fit_contextType_SC_conf, 
    lower = ContextType == "negative",
    higher = ContextType == "positive"
  )
print(ContextType_SC_negVSpos_conf)
Outcome of comparing groups: 
 * higher:  ContextType == "positive" 
 * lower:   ContextType == "negative" 
Mean 'higher - lower':  0.3449 
95% HDI:  [ -0.05677 ; 0.7291 ]
P('higher - lower' > 0):  0.9582 
Posterior odds:  22.95 

Results

The results of these comparisons are summarized here:

comparison measure posterior 95% HDI (low) 95% HDI (high)
negative < neutral prior 0.99650 0.0619778 0.3064931
neutral < positive prior 0.99850 0.1097723 0.3631348
neutral < negative prior-confidence 0.98950 0.0741972 0.6352106
negative < positive prior-confidence 0.95825 -0.0567746 0.7290585

The ContextType manipulation seems to have worked as expected for the prior ratings: lower in ‘negative’ than in ‘neutral’ than in ‘positive’. There are also differences in the confidence ratings, with lowest confidence in the ‘neutral’ contexts (which makes sense), but no indication of a difference between ‘negative’ having less confidence than ‘positive’.

Further exploration

Finally, this is an interesting plot, showing the relation between prior ratings and prior confidence, also in dependence on ContextType. We see that, unsurprisingly, the value 0.5 for prior ratings has an exceptional status, attracting a lot of data points from the whole range of prior confidence. Ignoring this value, there is a seemingly U-shaped curve, suggesting a trend to be more confident in ratings further away from 0.5. [We could speak of Laplacian confidence: total confidence that we have absolutely no idea, hence prior=0.5. That level of confidence is not attainable, it seems, for 0.49, unless we’d have very specific information.]

Code
# prior rating vs. prior confidence, colored by context type;
# vertical jitter only, since confidence is discrete (1-7)
ggplot(d, aes(x = prior_sliderResponse,
              y = prior_confidence,
              color = ContextType)) +
  geom_jitter(height = 0.3, width = 0, alpha = 0.7) +
  labs(x = "prior rating", y = "prior confidence")

We also check this for the posterior:

Code
# same check for the posterior: rating vs. confidence by context type
ggplot(d, aes(x = posterior_sliderResponse,
              y = posterior_confidence,
              color = ContextType)) +
  geom_jitter(height = 0.3, width = 0, alpha = 0.7) +
  labs(x = "posterior rating", y = "posterior confidence")

Effects of AnswerPolarity and AnswerCertainty on beliefChange

We can define beliefChange as the difference between posterior and prior in the direction expected from the answer’s polarity (posterior belief in ‘yes’ answer increases for a ‘positive’ answer when compared with the prior rating, but it decreases for ‘negative’ answers). (Careful: we ignore non-answers (which are categorized as “positive” for technical convenience only).) If our manipulation worked, we expect the following for both ‘positive’ and ‘negative’ polarity:

  1. beliefChange is > 0
  2. beliefChange is lower for ‘low certainty’ than for ‘high certainty’ than for ‘exhaustive’

Here is a plot visually addressing these issues:

Code
# belief change signed toward the answer's polarity, non-answers excluded
d %>%
  filter(AnswerCertainty != "non_answer") %>%
  mutate(
    # flip the sign for negative answers so "expected direction" is positive
    beliefChange = (posterior_sliderResponse - prior_sliderResponse) *
      ifelse(AnswerPolarity == "positive", 1, -1)
  ) %>%
  ggplot(aes(x = beliefChange, color = AnswerCertainty, fill = AnswerCertainty)) +
  geom_density(alpha = 0.3) +
  facet_grid(AnswerPolarity ~ AnswerCertainty) +
  labs(x = "belief change (in expected direction)", y = "") +
  theme_aida()

beliefChange is positive

To address the first issue, whether beliefChange is positive for both types of polarity, we first regress beliefChange against the full list of potentially relevant factors, including plausible RE structures. Notice that at the time of answering the questions related to the posterior, participants have not yet seen the question about relevance or helpfulness, so the factor group (equivalently: trigger word) should be omitted.

Code
# sanity-check model: belief change regressed on the full factorial design;
# the group/trigger-word factor is omitted (participants had not yet seen
# that question); non-answers excluded and belief change signed toward
# the answer's polarity, as in the plot above
if (rerun_models) {
  fit_answer_SC <- brm(
    formula = beliefChange ~ ContextType * AnswerCertainty * AnswerPolarity +
      (1 + ContextType + AnswerCertainty + AnswerPolarity | StimID) +
      (1 + ContextType + AnswerCertainty + AnswerPolarity | submission_id),
    data = d %>% filter(AnswerCertainty != "non_answer") %>% 
      mutate(beliefChange = posterior_sliderResponse - prior_sliderResponse,
             beliefChange = ifelse(AnswerPolarity == "positive", beliefChange, - beliefChange))
  )
  saveRDS(fit_answer_SC, "cachedModels/fit_answer_SC.Rds")
} else{
  fit_answer_SC <- readRDS("cachedModels/fit_answer_SC.Rds")
}

We check if inferred cell means are credibly bigger than zero, for all six relevant design cells (facets in the plot above).

Code
# 1. Check if belief change in each cell is bigger than zero
cellDraws_answers <- tibble(
  extract_cell_draws(fit_answer_SC, AnswerCertainty == "low_certainty" & AnswerPolarity == "positive", "low_pos"),
  extract_cell_draws(fit_answer_SC, AnswerCertainty == "high_certainty" & AnswerPolarity == "positive", "high_pos"),
  extract_cell_draws(fit_answer_SC, AnswerCertainty == "exhaustive"    & AnswerPolarity == "positive", "exh_pos"),
  extract_cell_draws(fit_answer_SC, AnswerCertainty == "low_certainty" & AnswerPolarity == "negative", "low_neg"),
  extract_cell_draws(fit_answer_SC, AnswerCertainty == "high_certainty" & AnswerPolarity == "negative", "high_neg"),
  extract_cell_draws(fit_answer_SC, AnswerCertainty == "exhaustive"    & AnswerPolarity == "negative", "exh_neg")
) 

# all posterior 95% HDIs are wayquire above 0 
cellDraws_answers %>% as.matrix() %>% 
  apply(., 2, aida::summarize_sample_vector)
$low_pos
# A tibble: 1 × 4
  Parameter `|95%`  mean `95%|`
  <chr>      <dbl> <dbl>  <dbl>
1 ""        0.0776 0.134  0.189

$high_pos
# A tibble: 1 × 4
  Parameter `|95%`  mean `95%|`
  <chr>      <dbl> <dbl>  <dbl>
1 ""         0.175 0.234  0.292

$exh_pos
# A tibble: 1 × 4
  Parameter `|95%`  mean `95%|`
  <chr>      <dbl> <dbl>  <dbl>
1 ""         0.365 0.447  0.519

$low_neg
# A tibble: 1 × 4
  Parameter `|95%`  mean `95%|`
  <chr>      <dbl> <dbl>  <dbl>
1 ""         0.114 0.215  0.315

$high_neg
# A tibble: 1 × 4
  Parameter `|95%`  mean `95%|`
  <chr>      <dbl> <dbl>  <dbl>
1 ""         0.196 0.300  0.398

$exh_neg
# A tibble: 1 × 4
  Parameter `|95%`  mean `95%|`
  <chr>      <dbl> <dbl>  <dbl>
1 ""         0.221 0.324  0.423
Code
# posterior probability of the mean being bigger than 0, per cell
# (column-wise proportion of positive draws; almost 1 everywhere)
colMeans(as.matrix(cellDraws_answers) > 0)
 low_pos high_pos  exh_pos  low_neg high_neg  exh_neg 
       1        1        1        1        1        1 

These results suggest that there is little reason to doubt that the belief changes induced by the answers — as per the experimentally intended manipulation — went in the right direction in all cases.

Code
# TODO make a plot of the posterior cell draws, include 95% HDIs there (more perspicuous than the numbers)

beliefChange increases with more informative answers

Finally, we investigate whether beliefChange increases with more informative answers, using the same regression model as before.

Code
# main-effect contrast of polarity on belief change (positive vs negative)
AnswerPolarity_main <- compare_groups(
  fit_answer_SC,
  lower = AnswerPolarity == "positive",
  higher  = AnswerPolarity == "negative"
)

# ordered certainty contrast: low < high
AnswerCertainty_lowVShigh <- compare_groups(
  fit_answer_SC,
  lower   = AnswerCertainty == "low_certainty",
  higher  = AnswerCertainty == "high_certainty"
)

# ordered certainty contrast: high < exhaustive
AnswerCertainty_highVSexh <- compare_groups(
  fit_answer_SC,
  lower   = AnswerCertainty == "high_certainty",
  higher  = AnswerCertainty == "exhaustive"
)
comparison measure posterior 95%HDI (low) 95%HDI (high)
pos vs neg polarity belief change 0.57400 -0.0838574 0.1053015
low vs high certainty belief change 0.99725 0.0404434 0.1512239
high certain vs exh belief change 0.99350 0.0275226 0.2065816

We see no indication of a main effect of polarity, but find support for the claim that our manipulation of AnswerCertainty induced gradually larger belief changes. In sum, it seems that the stimuli were adequately created to implement the intended manipulation in the variables AnswerCertainty and AnswerPolarity.

Exploring the “no belief change cases”

A “no belief change” case is a trial in which the prior judgement equals the posterior judgement and also the ratings of confidence in each are exactly the same. These are expected to happen only for ‘non-answers’ or for cases where the prior was already extreme (in the direction of the polarity of the answer). It should also be checked if individual participants are particularly prone to indicating no belief change at all.

Code
# flag trials with literally no change from prior to posterior: both the
# slider rating and the confidence rating are identical. coalesce() keeps
# the original case_when() semantics, where NA comparisons count as FALSE.
d %>% mutate(
  noBeliefChange = coalesce(
    prior_sliderResponse == posterior_sliderResponse &
      prior_confidence == posterior_confidence,
    FALSE
  )
) %>%
  select(submission_id, AnswerCertainty, prior_sliderResponse, posterior_sliderResponse,
         prior_confidence, posterior_confidence, noBeliefChange) %>% 
  knitr::kable()
Code
# TODO this code is currently not executed, because the output is quite large
# TODO check if this looks okay; any individuals particularly prone to no-belief-changes?

Predicting relevance in terms of the experimental factors

To see how participants’ relevance ratings depend on the experimental manipulations, we run a linear regression model predicting the former in terms of the latter:

Code
# Omitting interactions in the REs because of the unbalanced (factorial) design
# TODO rethink this
# (use `<-` for top-level assignment, per tidyverse style)
formula <- relevance_sliderResponse ~ group * ContextType * AnswerCertainty * AnswerPolarity + 
  (1 + group + ContextType + AnswerCertainty + AnswerPolarity | StimID) + 
  (1 + ContextType + AnswerCertainty + AnswerPolarity | submission_id)

# priors must be specified b/c with improper priors the posterior is improper as well
prior <- prior(student_t(1, 0, 1), class = "b")

if (rerun_models) {
  fit <- brm(
    formula = formula,  
    iter = 4000,
    prior = prior,
    data = d
  )
  saveRDS(fit, "cachedModels/fit.Rds")
} else {
  fit <- readRDS("cachedModels/fit.Rds")
}

Can we gloss over the different trigger words?

To simplify analyses, it would be helpful to know whether we can gloss over the trigger word manipulation. So, does it matter whether participants were asked to rate relevance or helpfulness?

To start with, let’s just look at whether there is a main effect, which there is not (possibly also partially explained away by by-subject random slopes):

Code
# main effect of "group" ?
# (between-subjects trigger-word manipulation: 'relevant' vs 'helpful')
group_main <- compare_groups(
  fit,
  higher = group == "relevant",
  lower  = group == "helpful"
)
print(group_main)

Alternatively, we may compare models with and without the group factors. Starting with simple Bayes factors (no REs):

Code
# Bayes-factor model comparison without random effects (BayesFactor::lmBF);
# full model includes the group factor and all interactions
BFComp_group_full <- lmBF(relevance_sliderResponse ~ group * ContextType * AnswerCertainty * AnswerPolarity,
  data = as.data.frame(d))
# lesioned model drops the group factor entirely
BFComp_group_noGroup <- lmBF(relevance_sliderResponse ~ ContextType * AnswerCertainty * AnswerPolarity,
  data = as.data.frame(d))

We find that the model without group has a very high Bayes factor in its favor:

Code
# Bayes factor in favor of the lesioned (no-group) model over the full model
BFComp_group_noGroup / BFComp_group_full
Bayes factor analysis
--------------
[1] ContextType * AnswerCertainty * AnswerPolarity : 925.514 ±6.21%

Against denominator:
  relevance_sliderResponse ~ group * ContextType * AnswerCertainty * AnswerPolarity 
---
Bayes factor type: BFlinearModel, JZS

This also shows in this plot:

Code
# bar plot of each model's BF relative to the intercept-only baseline
model_names <- c("full", "lesioned")

tibble(
  full     = as.vector(BFComp_group_full),
  lesioned = as.vector(BFComp_group_noGroup)
) %>%
  pivot_longer(everything(), names_to = "model", values_to = "relative_BF") %>%
  # fix the display order of the bars
  mutate(model = factor(model, rev(model_names))) %>%
  ggplot(aes(x = model, y = relative_BF)) +
  scale_y_log10() +
  geom_col() +
  coord_flip() +
  labs(x = "", y = "log BF (relative to intercept only)")

Finally, let’s also compare with LOO cross-validation. We first run the lesioned model:

Code
# lesioned model for LOO comparison: same as `fit` but without the group
# factor; reuses the `prior` object defined for the full model
if (rerun_models) {
  fit_noGroup <- brm(
    formula = relevance_sliderResponse ~ ContextType * AnswerCertainty * AnswerPolarity + 
      (1 + ContextType + AnswerCertainty + AnswerPolarity | StimID) + 
      (1 + ContextType + AnswerCertainty + AnswerPolarity | submission_id),  
    iter = 4000,
    prior = prior,
    data = d
  )
  saveRDS(fit_noGroup, "cachedModels/fit_noGroup.Rds")
} else{
  fit_noGroup <- readRDS("cachedModels/fit_noGroup.Rds")
}

And then comparing via LOO:

Code
# add LOO information
fit <- add_criterion(fit, "loo", model_name = "full")
fit_noGroup <- add_criterion(fit_noGroup, "loo", model_name = "w/o group")

# compare models by LOO (best model on top; elpd_diff is relative to it)
loo_compare(
  fit,   
  fit_noGroup
)  
            elpd_diff se_diff
fit          0.0       0.0   
fit_noGroup -6.9       8.3   

It appears that, when comparing trained models with REs, the lesioned model is slightly worse numerically, but not with a margin large enough to strongly prefer the full model. We take all of this as an indication that we may proceed (temporarily, in visualizations) with partial neglect of the trigger word distinction.

Effect of AnswerPolarity, AnswerCertainty and ContextType on relevance ratings

To investigate further which experimental factors influence the ratings of relevance of an answer, start by a visualization:

Code
# relevance ratings by answer polarity, faceted by certainty and context
ggplot(d, aes(x = `relevance rating`, color = AnswerPolarity, fill = AnswerPolarity)) +
  facet_grid(AnswerCertainty ~ ContextType, scales = "free") +
  geom_density(alpha = 0.3) +
  theme_aida()

Let’s now look at a bunch of contrasts (based on the previously fitted full model).

Code
## expected ordering relation?
## non-answers vs low-certainty => posterior prob. = 1
nonAns_VS_low  <- compare_groups(
  fit,
  lower  = AnswerCertainty == "non_answer",
  higher = AnswerCertainty == "low_certainty"
)
## low-certainty vs high-certainty => posterior prob. = 0.9922
low_VS_high <- compare_groups(
  fit,
  lower  = AnswerCertainty == "low_certainty",
  higher = AnswerCertainty == "high_certainty"
)
## high-certainty vs exhaustive => posterior prob. = 1
high_VS_exh <- compare_groups(
  fit,
  lower  = AnswerCertainty == "high_certainty",
  higher = AnswerCertainty == "exhaustive"
)


# TODO check if that also holds for comparisons within each group

## effects of AnswerPolarity?
## (non-answers excluded: their 'positive' label is a technical artifact)
AnswerPolarity_main <- compare_groups(
  fit,
  lower  = AnswerPolarity == "positive" & AnswerCertainty != "non_answer",
  higher = AnswerPolarity == "negative" & AnswerCertainty != "non_answer"
)

## polarity contrast within low-certainty answers only
AnswerPolarity_lowCertain <- compare_groups(
  fit,
  lower  = AnswerPolarity == "positive" & AnswerCertainty == "low_certainty",
  higher = AnswerPolarity == "negative" & AnswerCertainty == "low_certainty"
)

## polarity contrast within high-certainty answers only
AnswerPolarity_highCertain <-compare_groups(
  fit,
  lower  = AnswerPolarity == "positive" & AnswerCertainty == "high_certainty",
  higher = AnswerPolarity == "negative" & AnswerCertainty == "high_certainty"
)

## polarity contrast within exhaustive answers only
AnswerPolarity_exhaustive <-compare_groups(
  fit,
  lower  = AnswerPolarity == "positive" & AnswerCertainty == "exhaustive",
  higher = AnswerPolarity == "negative" & AnswerCertainty == "exhaustive"
)

## neutral contexts vs the two biased context types pooled
ContextType_neutral <- 
  compare_groups(fit, higher = ContextType == "neutral", lower = ContextType != "neutral")

# assemble the contrasts of interest into a summary table
cellComparisons <- tribble(
  ~comparison, ~posterior, ~"95%HDI (low)", ~"95%HDI (high)",
  # "group: helpful < relevance" , group_main$post_prob, group_main$l_ci, group_main$u_ci,  
  "non-answer < low certainty" , nonAns_VS_low$post_prob, nonAns_VS_low$l_ci, nonAns_VS_low$u_ci,  
  "low certain < high certain" , low_VS_high$post_prob, low_VS_high$l_ci, low_VS_high$u_ci,  
  "high certain < exhaustive" , high_VS_exh$post_prob, high_VS_exh$l_ci, high_VS_exh$u_ci,  
  "answer: pos < neg", AnswerPolarity_main$post_prob, AnswerPolarity_main$l_ci, AnswerPolarity_main$u_ci,
  # "Polarity (low certain)", AnswerPolarity_lowCertain$post_prob, AnswerPolarity_lowCertain$l_ci, AnswerPolarity_lowCertain$u_ci,
  # "Polarity (high certain)", AnswerPolarity_highCertain$post_prob, AnswerPolarity_highCertain$l_ci, AnswerPolarity_highCertain$u_ci,
  # "Polarity (exhaustive)", AnswerPolarity_exhaustive$post_prob, AnswerPolarity_exhaustive$l_ci, AnswerPolarity_exhaustive$u_ci,
  "context: neutral > pos/neg", ContextType_neutral$post_prob, ContextType_neutral$l_ci, ContextType_neutral$u_ci
)

knitr::kable(cellComparisons)
comparison posterior 95%HDI (low) 95%HDI (high)
non-answer < low certainty 1.000000 0.4309597 0.5688066
low certain < high certain 0.997250 0.0452239 0.1968662
high certain < exhaustive 0.999875 0.0954769 0.2474595
answer: pos < neg 0.916375 -0.0143989 0.0830984
context: neutral > pos/neg 0.916875 -0.0084320 0.0578527

The table shows results indicating that there are (non-surprising) effects of AnswerCertainty, with non-answers rated as least relevant, followed by low-certainty, then high-certainty answers, and finally exhaustive answers. There is no (strong) indication of a main effect of AnswerPolarity or ContextType. The lack of an effect of ContextType might be interpreted as prima facie evidence in favor of quantitative notions of relevance that do not take the prior into account (at least not very prominently).

Here is a plot of the relevant posterior draws visually supporting why we compared the three factor levels of ContextType in the way we did:

Code
# posterior cell draws per context type, reshaped to long format for plotting
draws_ContextType <- tibble(
  extract_cell_draws(fit, ContextType == "positive", colname = "positive"),
  extract_cell_draws(fit, ContextType == "negative", colname = "negative"),
  extract_cell_draws(fit, ContextType == "neutral", colname = "neutral")
) %>%
  pivot_longer(cols = everything())

ggplot(draws_ContextType, aes(x = value, color = name, fill = name)) +
  geom_density(alpha = 0.3)

Exploring QuaRels

In this section we explore which of the quantitative notions of relevance is a good predictor of the relevance judgements.

Preparing the data

We start by preparing the data for these analyses.

Correcting calculation of entropy reduction for beta-beliefs

It seems that there is a problem with the original calculation of entropy reduction for beta distributions: ER_beta is NA whenever at least one of the confidence ratings is 7. So, ER for beta is NA if a person was very, very confident.

Code
# confirm the pattern: entropy_reduction_beta is NA exactly when at least
# one of the two confidence ratings is at the maximum (7)
all(
  is.na(d$entropy_reduction_beta) ==
    (d$prior_confidence == 7 | d$posterior_confidence == 7)
)
[1] TRUE

ER values should not be NA when confidence levels are high, but rather just very low. Therefore, we add a new ER_beta value (and store the individual entropies of the prior and posterior along the way).

Code
# Differential entropy of a Beta(a, b) distribution (scalar a, b):
#   H = log B(a, b) + (a0 - 2) * digamma(a0)
#       - (a - 1) * digamma(a) - (b - 1) * digamma(b),  with a0 = a + b
# Uses lbeta() for the log-Beta term: the original
# log(gamma(a) * gamma(b) / gamma(a0)) overflows to Inf/NaN for large shape
# parameters (gamma(x) overflows for x > ~171), which is exactly why entropy
# came out NA for high-confidence (high-concentration) ratings.
# Any remaining non-finite result is floored at -5, as before.
# (`is.na()` is redundant after `!is.finite()`, which is FALSE for NA too.)
get_entropy_beta <- function(a, b) {
  a0 <- a + b
  entropy <- lbeta(a, b) +
    (a0 - 2) * digamma(a0) -
    (a - 1) * digamma(a) - (b - 1) * digamma(b)
  if (!is.finite(entropy)) {
    return(-5)
  }
  entropy
}

# Compute per-row entropies of the prior and posterior beta beliefs and
# the corrected entropy-reduction measure as their absolute difference.
# map2_dbl() iterates over the column pairs directly, avoiding the
# 1:nrow(d) indexing anti-pattern and the references back into `d`.
d <- d %>%
  mutate(
    prior_entropy     = map2_dbl(prior_beta_a, prior_beta_b, get_entropy_beta),
    posterior_entropy = map2_dbl(posterior_beta_a, posterior_beta_b, get_entropy_beta),
    ER_beta = abs(prior_entropy - posterior_entropy)
  )

Adding a measure for Bayes factors for beta-beliefs

We also want a version of BF for the beta distributions. There are different ways of doing this. There might not be a “right” way of doing this. It all depends on what we even mean by ‘Bayes factor’ in this context. Normally, a Bayes factor is the factor by which observation of data \(D\) changes the prior odds between models \(M_1\) and \(M_2\) to yield the posterior odds:

\[ \underbrace{\frac{P(M_1 \mid D)}{P(M_2 \mid D)}}_{\mathrm{posterior\ odds}} = \underbrace{\frac{P(D \mid M_1)}{P(D \mid M_2)}}_{\mathrm{Bayes\ factor}} \ \underbrace{\frac{P(M_1)}{P(M_2)}}_{\mathrm{prior\ odds}} \]

Let’s first look at the notion we called ‘Bayes factor’ for the first-order case. Let the prior be described by the prior probability \(p\) of the positive answer being true. Similarly, with \(q\) for the posterior. We defined the BF as the

\[ \underbrace{\frac{q}{1-q}}_{\mathrm{posterior\ odds}} = \underbrace{b}_{\mathrm{Bayes\ factor}} \ \underbrace{\frac{p}{1-p}}_{\mathrm{prior\ odds}} \]

This maps onto the usual definition of the Bayes factor by mapping the model \(M_1\) onto the categorical belief that the positive answer is true, and \(M_2\) onto a model which categorically assumes that the negative answer is true. Or, in other words, the first-order case allows us to map the prior and posterior beliefs directly onto an odds-form, so that we can directly quantify a measure of change induced by some observation in terms of the product:

\[ b = \frac{q}{1-q} \frac{1-p}{p} \]

What would an analogy be for higher-order beta beliefs? - I don’t think that there is a direct analogy. We have to find another definition that captures the idea of the Bayes factor in the first equation, which is a measure of evidence inherent in an observation that would take us from prior to posterior.

Here are three suggestions. The first is an obvious candidate we might want to address, but get out of the way. The second is new. The third is what we had before (in presentations / slides).

BF beta from expectations

We could try to use the same approach as for first-order beliefs, just using the expected value for the truth of the positive / negative answer. If the prior is given by beta weights \(a_{prior}\) and \(b_{prior}\), the expected value of the truth of the positive answer is \(\frac{a_{prior}}{a_{prior} + b_{prior}}\); similarly, for the negative answer: \(\frac{b_{prior}}{a_{prior} + b_{prior}}\). Using this, we could define the BF-measure for beta beliefs as:

\[ b = \frac{b_{prior}}{a_{prior}} \ \frac{a_{posterior}}{b_{posterior}} \] However, it is clear that this is not what we want, because it would give us a neutral BF value of 1 whenever, for example, only the higher-order uncertainty shrinks while the expected values stay the same.

BF beta from sampling

The previous approach failed to bring the uncertainty in a beta belief to bear on the definition of our BF-like concept. We can get that back and stay true to the original first-order definition if we integrate: where previously we only had one true answer probability \(p\), we now want to use samples from prior and posterior (to approximate the integrals over). Concretely, we calculate the expectation of (the log of (because otherwise we cannot reasonably calculate a mean)):

\[b = \frac{q}{1-q}\frac{1-p}{p}\]

for samples \(p \sim Beta(a_{prior},b_{prior})\) and \(q \sim Beta(a_{posterior},b_{posterior})\).

In the implementation I’m currently using another log-transform (b/c of increased fit to the data).

Code
# Monte-Carlo estimate of the 'sampled' BF notion for beta beliefs:
# draw p from the prior Beta and q from the posterior Beta, average the
# log of the first-order BF b = q/(1-q) * (1-p)/p over the samples, and
# return exp of that mean (the geometric-mean BF).
getExpectedBF <- function(prior_a, prior_b, posterior_a, posterior_b, nSamples) {
  p_draws <- rbeta(nSamples, prior_a, prior_b)
  q_draws <- rbeta(nSamples, posterior_a, posterior_b)
  log_bf <- log(q_draws / (1 - q_draws) * (1 - p_draws) / p_draws)
  exp(mean(log_bf))
}
# Apply the sampled-BF measure row-wise. pmap_dbl() passes bare numeric
# scalars; the original `d[i, 'col']` returned a 1x1 tibble and relied
# on fragile implicit coercion inside rbeta().
# The extra log(abs(log(.))) transform is retained from the original
# analysis (reportedly improves fit to the data).
d$BF_beta_sampled <- pmap_dbl(
  list(d$prior_beta_a, d$prior_beta_b, d$posterior_beta_a, d$posterior_beta_b),
  function(prior_a, prior_b, posterior_a, posterior_b) {
    getExpectedBF(prior_a, prior_b, posterior_a, posterior_b, 5000) %>%
      log() %>% abs() %>% log()
  }
)

BF beta from comparing beta-weights

Finally we consider BF a measure of the strength of observational evidence to take us from one belief to another. Beta beliefs are defined by a pair of beta weights, \(a\) and \(b\), which can be interpreted as the number of observations of successes and failures, starting from a flat belief. The difference in observations that take an agent from a prior beta belief (with weights \(a_{prior}\) and \(b_{prior}\)) to another, can therefore be operationalized as:

\[ |a_{prior} - a_{posterior}| + |b_{prior} - b_{posterior}| \]

For scaling, we use the log of this measure (and scale it to take only unit values; which is innocent in the context of linear regression).

Code
# BF measure from comparing beta weights: log of the total absolute
# change in pseudo-observations, rescaled to the unit interval.
d <- d %>%
  mutate(
    # NOTE(review): the +2 offset is unexplained in the original analysis
    # but is kept because it reportedly improves the fit.
    BF_beta = log(abs(prior_beta_a - posterior_beta_a) +
                    abs(prior_beta_b - posterior_beta_b) + 2)
  ) %>%
  mutate(BF_beta = BF_beta / max(BF_beta))

Comparing the two different BF beta notions

Code
# Scatter plot of the weight-based vs. the sampling-based BF measure,
# with a loess smoother to show their (monotone) relationship.
ggplot(d, aes(x = BF_beta, y = BF_beta_sampled)) +
  geom_point() +
  geom_smooth()
`geom_smooth()` using method = 'loess' and formula 'y ~ x'

Correlation scores and plots

For plotting and subsequent analyses we use some more uniform names:

Code
# Duplicate the model-predictor columns under shorter, uniform names
# (originals are kept; the new columns are appended in this order).
d$relevance <- d$relevance_sliderResponse
d$ER        <- d$entropy_reduction
d$KL        <- d$kl_util
d$BF        <- d$exp_bayes_factor
d$KL_beta   <- d$kl_util_beta

Here is a first correlation plot:

Code
# Names of the observed rating and all candidate relevance measures;
# this vector also fixes the display order in later plots.
predictiveFactors <- c("relevance", "ER", "KL", "BF",
                       "ER_beta", "KL_beta", "BF_beta", "BF_beta_sampled")

# Correlation heatmap over all measures (GGally::ggcorr needs a matrix).
predFactorMatrix <- d %>%
  select(all_of(predictiveFactors)) %>%
  as.matrix()

ggcorr(predFactorMatrix, palette = "RdBu")

And here is another:

Code
# Pairs plot of all measures, colored by AnswerCertainty. ggpairs does
# not respect the globally overridden discrete scales, so the project
# colors are applied panel by panel below.
p <- ggpairs(
  d %>% select(all_of(predictiveFactors), AnswerCertainty),
  aes(color = AnswerCertainty)
) + theme_aida()
# seq_len() instead of 1:n guards against the degenerate 0-panel case
# (1:0 would iterate over c(1, 0)).
for (i in seq_len(p$nrow)) {
  for (j in seq_len(p$ncol)) {
    p[i, j] <- p[i, j] +
      scale_fill_manual(values = project_colors) +
      scale_color_manual(values = project_colors)
  }
}
p

And here is a scatter plot for the predictive measures and observed relevance ratings:

Code
# Faceted scatter plots: each candidate measure (x) against the observed
# relevance rating (y), colored by AnswerCertainty.
# all_of() makes the use of the external character vector explicit;
# bare vectors in select()/pivot_longer() are deprecated tidyselect.
d %>%
  select(all_of(predictiveFactors), AnswerCertainty) %>%
  pivot_longer(
    cols = all_of(predictiveFactors[2:8]),
    names_to = "factor",
    values_to = "value"
  ) %>%
  # fix facet order to match the declaration order of the measures
  mutate(factor = factor(factor, levels = predictiveFactors[2:8])) %>%
  ggplot(aes(x = value, y = relevance, color = AnswerCertainty)) +
  geom_point(alpha = 0.5) +
  facet_wrap(. ~ factor, scales = "free") +
  theme_aida()

Single-factor model comparisons

To compare explanatory factors, we look at regression models with each candidate factor as the single predictor. We compare these models from an /ex ante/ and from an /ex post/ perspective, using Bayes factors and LOO cross-validation respectively.

Bayes factor model comparison

For ease of computation, we can use the BayesFactor package. (This only provides limited use of random effects, if at all.)

Code
# Single-factor Bayes-factor model comparison: each candidate measure as
# sole predictor of the relevance ratings, relative to intercept-only.
# (lmBF needs a plain data.frame, hence as.data.frame(d).)
BFComp_rel_KL      <- lmBF(relevance_sliderResponse ~ KL, data = as.data.frame(d))
BFComp_rel_ER      <- lmBF(relevance_sliderResponse ~ ER, data = as.data.frame(d))
BFComp_rel_BF      <- lmBF(relevance_sliderResponse ~ BF, data = as.data.frame(d))
BFComp_rel_KL_beta <- lmBF(relevance_sliderResponse ~ KL_beta, data = as.data.frame(d))
BFComp_rel_ER_beta <- lmBF(relevance_sliderResponse ~ ER_beta, data = as.data.frame(d))
BFComp_rel_BF_beta <- lmBF(relevance_sliderResponse ~ BF_beta, data = as.data.frame(d))
BFComp_rel_BF_beta_sampled <- lmBF(relevance_sliderResponse ~ BF_beta_sampled, data = as.data.frame(d))

model_names <- c("ER", "KL", "BF", "ER_beta", "KL_beta", "BF_beta", "BF_beta_sampled")

# Collect the fitted comparisons in a named list and extract the BF of
# each model against the intercept-only null via as.vector() -- this
# replaces the repetitive tibble-column + pivot_longer construction.
single_factor_BFs <- list(
  ER      = BFComp_rel_ER,
  KL      = BFComp_rel_KL,
  BF      = BFComp_rel_BF,
  ER_beta = BFComp_rel_ER_beta,
  KL_beta = BFComp_rel_KL_beta,
  BF_beta = BFComp_rel_BF_beta,
  BF_beta_sampled = BFComp_rel_BF_beta_sampled
)

tibble(
  model       = names(single_factor_BFs),
  relative_BF = map_dbl(single_factor_BFs, ~ as.vector(.x)[1])
) %>%
  # reversed factor levels so the first model appears at the top after coord_flip
  mutate(model = factor(model, levels = rev(model_names))) %>%
  ggplot(aes(x = model, y = relative_BF)) +
  scale_y_log10() +
  geom_col() +
  coord_flip() +
  xlab("") +
  ylab("log BF (relative to intercept only)")

We see that the BF-based notions are the best single-factor predictors. KL_beta is doing a good job as well.

LOO-based model comparison

We use the functionality that ships with the brms package to calculate LOO-CV values via Pareto-smoothed importance sampling.

Code
# ER
fit_loo_ER <- brm(
  relevance_sliderResponse ~ ER,
  iter = 4000,
  prior = prior(student_t(1, 0, 25), class = "b"),
  sample_prior = T,
  data = d 
) %>% add_criterion("loo", model_name = "ER")

# KL
fit_loo_KL <- brm(
  relevance_sliderResponse ~ KL,
  iter = 4000,
  prior = prior(student_t(1, 0, 25), class = "b"),
  sample_prior = T,
  data = d 
) %>% add_criterion("loo", model_name = "KL")

# BF
fit_loo_BF <- brm(
  relevance_sliderResponse ~ BF,
  iter = 4000,
  prior = prior(student_t(1, 0, 25), class = "b"),
  sample_prior = T,
  data = d 
) %>% add_criterion("loo", model_name = "BF")

# ER_beta
fit_loo_ER_beta <- brm(
  relevance_sliderResponse ~ ER_beta,
  iter = 4000,
  prior = prior(student_t(1, 0, 25), class = "b"),
  sample_prior = T,
  data = d 
) %>% add_criterion("loo", model_name = "ER_beta")

# KL_beta
fit_loo_KL_beta <- brm(
  relevance_sliderResponse ~ KL_beta,
  iter = 4000,
  prior = prior(student_t(1, 0, 25), class = "b"),
  sample_prior = T,
  data = d 
) %>% add_criterion("loo", model_name = "KL_beta")

# BF_beta
fit_loo_BF_beta <- brm(
  relevance_sliderResponse ~ BF_beta,
  iter = 4000,
  prior = prior(student_t(1, 0, 25), class = "b"),
  sample_prior = T,
  data = d 
) %>% add_criterion("loo", model_name = "BF_beta")

# BF_beta (with sampling method)
fit_loo_BF_beta_sampled <- brm(
  relevance_sliderResponse ~ BF_beta_sampled,
  iter = 4000,
  prior = prior(student_t(1, 0, 25), class = "b"),
  sample_prior = T,
  data = d 
) %>% add_criterion("loo", model_name = "BF_beta_sampled")

loo_compare(
 fit_loo_ER, 
 fit_loo_KL, 
 fit_loo_BF,
 fit_loo_ER_beta, 
 fit_loo_KL_beta, 
 fit_loo_BF_beta,
 fit_loo_BF_beta_sampled
)
Code
# Plot ELPD (LOO-CV) estimates +/- 1 SE for the single-factor models.
model_names <- c("ER", "KL", "BF", "ER_beta", "KL_beta", "BF_beta", "BF_beta_sampled")
# same order as model_names; loo() retrieves the criterion stored by
# add_criterion(), so iterating with map_dbl avoids 14 near-identical lines
loo_fits <- list(
  fit_loo_ER, fit_loo_KL, fit_loo_BF,
  fit_loo_ER_beta, fit_loo_KL_beta, fit_loo_BF_beta,
  fit_loo_BF_beta_sampled
)

tibble(
  model = factor(model_names, levels = rev(model_names)),
  ELPD  = map_dbl(loo_fits, ~ loo(.x)$estimates["elpd_loo", "Estimate"]),
  SE    = map_dbl(loo_fits, ~ loo(.x)$estimates["elpd_loo", "SE"]),
  lower = ELPD - SE,
  upper = ELPD + SE
) %>%
  ggplot(aes(x = model, y = ELPD)) +
  # ymin/ymax are the correct aesthetics; the original `min`/`max` were
  # unknown aesthetics and the ranges were silently dropped
  geom_pointrange(aes(ymin = lower, ymax = upper)) +
  coord_flip() +
  xlab("") +
  ylab("expected log likelihood (LOO-CV) ")

Two-factor models

We can also compare measures against each other by pairing their first-order and their second-order version. Again we do this from an /ex ante/ and an /ex post/ perspective.

We here report the pairing of BF and BF_beta as BF, and the pairing of BF and BF_beta_sampled as BF_sampled.

We will see that the BF-based notions are better than ER or KL. There is no substantial difference between the two BF-based versions for the two-factor models. This holds for both manners of analysis: Bayes factors and LOO.

Bayes factor model comparison for two-factor models

Code
# Two-factor Bayes-factor comparison: pair each first-order measure with
# its beta-belief counterpart (main effects + interaction).
BFComp_KL_dbl <- lmBF(relevance_sliderResponse ~ KL * KL_beta, data = as.data.frame(d))
BFComp_ER_dbl <- lmBF(relevance_sliderResponse ~ ER * ER_beta, data = as.data.frame(d))
BFComp_BF_dbl <- lmBF(relevance_sliderResponse ~ BF * BF_beta, data = as.data.frame(d))
BFComp_BF_dbl_sampled <- lmBF(relevance_sliderResponse ~ BF * BF_beta_sampled,
                              data = as.data.frame(d))

model_names <- c("ER", "KL", "BF", "BF_sampled")

# Named list + map_dbl replaces the repetitive tibble-column construction.
two_factor_BFs <- list(
  ER         = BFComp_ER_dbl,
  KL         = BFComp_KL_dbl,
  BF         = BFComp_BF_dbl,
  BF_sampled = BFComp_BF_dbl_sampled
)

tibble(
  model       = names(two_factor_BFs),
  relative_BF = map_dbl(two_factor_BFs, ~ as.vector(.x)[1])
) %>%
  # reversed levels so the first model sits at the top after coord_flip
  mutate(model = factor(model, levels = rev(model_names))) %>%
  ggplot(aes(x = model, y = relative_BF)) +
  scale_y_log10() +
  geom_col() +
  coord_flip() +
  xlab("") +
  ylab("log BF (relative to intercept only)")

LOO-based model comparison for two-factor models

Code
# Two-factor fits: each first-order measure paired with its beta-belief
# counterpart (main effects + interaction), with the LOO criterion
# attached. One helper replaces the four copy-pasted brm() calls.
fit_pair_loo <- function(factor_1, factor_2, model_name) {
  brm(
    formula = as.formula(
      paste0("relevance_sliderResponse ~ ", factor_1, " * ", factor_2)
    ),
    iter = 4000,
    prior = prior(student_t(1, 0, 25), class = "b"),
    sample_prior = TRUE,
    data = d
  ) %>% add_criterion("loo", model_name = model_name)
}

# NOTE: these reuse (overwrite) the single-factor fit names from above.
fit_loo_ER <- fit_pair_loo("ER", "ER_beta", "ER")
fit_loo_KL <- fit_pair_loo("KL", "KL_beta", "KL")
fit_loo_BF <- fit_pair_loo("BF", "BF_beta", "BF")
fit_loo_BF_sampled <- fit_pair_loo("BF", "BF_beta_sampled", "BF_sampled")
Code
# Plot ELPD (LOO-CV) estimates +/- 1 SE for the two-factor models.
model_names <- c("ER", "KL", "BF", "BF_sampled")
# same order as model_names
two_factor_fits <- list(fit_loo_ER, fit_loo_KL, fit_loo_BF, fit_loo_BF_sampled)

tibble(
  model = factor(model_names, levels = rev(model_names)),
  ELPD  = map_dbl(two_factor_fits, ~ loo(.x)$estimates["elpd_loo", "Estimate"]),
  SE    = map_dbl(two_factor_fits, ~ loo(.x)$estimates["elpd_loo", "SE"]),
  lower = ELPD - SE,
  upper = ELPD + SE
) %>%
  ggplot(aes(x = model, y = ELPD)) +
  # ymin/ymax are the correct aesthetics; the original `min`/`max` were
  # unknown aesthetics and the ranges were silently dropped
  geom_pointrange(aes(ymin = lower, ymax = upper)) +
  coord_flip() +
  xlab("") +
  ylab("expected log likelihood (LOO-CV) ")

TODO: add drop-factor comparison; add RE structures